#!/usr/bin/python
# coding=utf-8

import urllib2
from bs4 import BeautifulSoup
import json
import MySQLdb
import base64

# *******************(1) Read the url, open the page, return the page data ************************
def OpenPage(url):
    '''
    Fetch *url* and return the response body as a byte string.

    Fix: the response object is now closed after reading (the original
    leaked the connection), and the unused empty header dict was dropped.
    '''
    req = urllib2.Request(url)
    f = urllib2.urlopen(req)
    try:
        return f.read()
    finally:
        f.close()

def Test1():
    '''
    Manual check of OpenPage against the job-fair listing page.

    Findings from running this test:
    - the content is already utf-8, so no extra decoding is needed;
    - the page is incomplete (no employer data) because the site loads that
      part with asynchronous ajax requests, so a second fetch is required;
    - on the home page, F12 -> Network after a refresh shows all requests of
      the visit; the last few xhr entries are the dynamically loaded ajax
      calls -- those carry the dynamic part, while our single url fetch only
      captured the static part of the page;
    - inspecting the crawled html we find fragments like the following:

    // company industry
    $.ajax({
    (the parameter here is not a plain url: as a safety measure the site
    passes a key/value style request instead)

    url: getCommonDataUrl('GetCompanyTypes'),
    type: 'GET',
    async: true,
    data: "",
    dataType: 'json',
    success: function (ajaxResult) {
    if (ajaxResult) {
        $('#SearchCompanyType').empty();
        $('#SearchCompanyType').append("<option value=>--请选择--</option>");
        $.each(ajaxResult, function (index, obj) {
        $('#SearchCompanyType').append("<option value=" + obj.value + ">" + obj.text + "</option>");

            });
          }
        }
    });

    '''
    print OpenPage("http://jy.51uns.com:8022/Pro_StudentEmploy/StudentJobFair/Zhaoping.aspx?WorkType0")

def Test1_1():
    '''
    Manual check of the GetZhaopinList ajax endpoint:

    http://jy.51uns.com:8022/Frame/Data/jdp.ashx?rnd=1534034231300&fn=GetZhaopinList&StartDate=2000-01-01&SearchKey=&InfoType=-1&CompanyAttr=&CompanyType=&Area=&City=&CompanyProvice=&Post=&Zhuanye=&XLkey=&Age=&start=0&limit=15&DateType=999&InfoState=1&WorkType=0&CompanyKey=

    The response holds the postings of one listing page (15 of them -- the
    limit field caps a page at 15 companies), each with an "Id" field.
    Opening one concrete detail page, e.g. for
    "Id":"b7986ae4f3f94e0bb79677c68b871e30" (the first one):
    http://jy.51uns.com:8022/Pro_StudentEmploy/StudentJobFair/Zhaoping_Detail.aspx?JobId=b7986ae4f3f94e0bb79677c68b871e30
    shows the matching id, so list entries and detail pages correspond one
    to one: every posting is keyed by its Id, and all the ids can be read
    from this ajax response.  Raising the limit field returns every Id at
    once, so the detail url of every posting can be built by concatenation.
    '''
    print OpenPage("http://jy.51uns.com:8022/Frame/Data/jdp.ashx?rnd=1534034231300&fn=GetZhaopinList&StartDate=2000-01-01&SearchKey=&InfoType=-1&CompanyAttr=&CompanyType=&Area=&City=&CompanyProvice=&Post=&Zhuanye=&XLkey=&Age=&start=0&limit=15&DateType=999&InfoState=1&WorkType=0&CompanyKey=")

# ****************************(2_1) Build the HTML listing pages *************************************
def write_html(file_name, i):
    '''
    Build one HTML listing page (22 companies per page, each row linking to
    the company's detail page).

    NOTE(review): stub -- the body was never implemented; Create_HTML below
    does this work inline instead.
    '''

# *******************(2) Parse the urls out of the async-loaded listing data ******************************
def ParseMainPage(page):
    '''
    Extract the detail-fetch url of every job posting in *page*.

    *page* is the JSON text returned by the GetZhaopinList ajax endpoint;
    its "rows" entry is a list of posting dicts, each carrying an "Id".
    Returns a list of GetOneZhaopin urls, one per posting, built by
    splicing each Id between a fixed prefix and tail.
    '''
    prefix = "http://jy.51uns.com:8022/Frame/Data/jdp.ashx?rnd=1534220883227&fn=GetOneZhaopin&JobId="
    tail = "&StartDate=2000-01-01"
    # the response is one big dict; "rows" holds the posting records
    postings = json.loads(page)["rows"]
    return [prefix + posting['Id'] + tail for posting in postings]

# companies shown on each listing page
_COMPANIES_PER_PAGE = 22

# inline stylesheet for the listing pages / pagination bar
_PAGE_CSS = '''
                    <style type="text/css" media="screen">
                    #pagination-digg li { border:0; margin:0; padding:0; font-size:11px; list-style:none; /* savers */ float:left;  }
                    #pagination-digg a { border:solid 1px #9aafe5; margin-right:2px;  }
                    #pagination-digg .previous-off,#pagination-digg .next-off  { border:solid 1px #DEDEDE; color:#888888;
                    display:block; float:left; font-weight:bold; margin-right:2px; padding:3px 4px; }
                    #pagination-digg .next a,#pagination-digg .previous a { font-weight:bold;  }
                    #pagination-digg .active { background:#2e6ab1; color:#FFFFFF; font-weight:bold; display:block; float:left;
                    padding:4px 6px; /* savers */ margin-right:2px; }
                    #pagination-digg a:link,#pagination-digg a:visited { color:#0e509e; display:block; float:left; padding:3px 6px; text-decoration:none; }
                    #pagination-digg a:hover { border:solid 1px #0e509e;  }
                    body { font-family:Arial, Helvetica, sans-serif; font-size:12px;  }
                    h2{ clear:both; border:0; margin:0; padding-top:30px; font-size:13px;  }
                    p{ border:0; margin:0; padding:0; padding-bottom:20px;  }
                    ul{ border:0; margin:0; padding:0;  }
                    </style>
'''


def _write_page_header(f):
    # document head, stylesheet and the opening of the company table
    f.write('''<html><head><meta http-equiv="content-type" content="text/html;charset=utf-8">\n''')
    f.write(_PAGE_CSS)
    f.write("</head>\n<body>")
    f.write('''<table border="1">\n''')
    f.write('''<thead>
                    <tr>
                        <th style="width: 50px">招聘单位</th>
                        <th style="width: 50px">招聘岗位</th>
                        <th style="width: 50px">招聘人数</th>
                        <th style="width: 40px">发布时间</th>
                     </tr>
       </thead>''')


def _write_page_footer(f, total_pages):
    # close the table and emit a pagination bar linking every listing page
    f.write("</table>")
    f.write('<ul id="pagination-digg">\n')
    f.write('<li>页数</li>\n')
    for page_no in range(1, total_pages + 1):
        f.write('<li><a href="./%d.html">%d</a></li>\n' % (page_no, page_no))
    f.write('<li>共%d页</li>\n' % total_pages)
    f.write('</ul>\n')
    f.write("</body>\n</html>")


def _safe_field(item, key, fallback):
    # some postings have empty/None fields; substitute a placeholder then
    try:
        return item[key].encode("utf8")
    except Exception:
        return fallback


def Create_HTML(page):
    '''
    Build the paginated listing pages ./file/<n>.html from the
    GetZhaopinList JSON response *page*: 22 companies per page, each row
    linking to its ./info/<i>.html detail page (i is the 0-based global
    row index, matching the files written by Create_INFO).

    Fixes over the original version:
      * the page-break branch fired on the very first row (i == 0), which
        closed ./file/1.html right after it was opened and left a stray
        footer at its top -- it now only fires between pages;
      * the last page never received its pagination footer and closing
        </body></html> tags -- every page gets them now;
      * the pagination bar was hard-coded to 38 pages -- it is now computed
        from the number of postings;
      * files are opened with "w" instead of "a+" so re-running the crawler
        does not append duplicate markup.
    '''
    data = json.loads(page)   # the response is one big dict
    rows = data["rows"]       # list of posting dicts
    total_pages = max(1, (len(rows) + _COMPANIES_PER_PAGE - 1) // _COMPANIES_PER_PAGE)

    f = open("./file/1.html", "w")
    _write_page_header(f)
    for i, item in enumerate(rows):
        if i > 0 and i % _COMPANIES_PER_PAGE == 0:
            # finish the current page and start the next one
            _write_page_footer(f, total_pages)
            f.close()
            f = open("./file/" + str(i // _COMPANIES_PER_PAGE + 1) + ".html", "w")
            _write_page_header(f)
        CompanyTitle = _safe_field(item, "CompanyTitle", "未知")
        WorkPositon = _safe_field(item, "WorkPositon", "不限")
        EmployNum = _safe_field(item, "EmployNum", "未知")
        AddTime = _safe_field(item, "AddTime", "未知")
        # one table row per company; the title cell links to the detail page
        f.write("<tr>")
        f.write('''<td><a href="''' + '''./info/''' + str(i) + '''.html">''' + CompanyTitle + "</a></td>\n")
        f.write("<td>" + WorkPositon + "</td>\n")
        f.write("<td>" + EmployNum + "</td>\n")
        f.write("<td>" + AddTime + "</td>\n")
        f.write("</tr>")
    _write_page_footer(f, total_pages)
    f.close()





def Create_INFO(urllist):
    '''
    Fetch every detail url in *urllist* and write one HTML detail page per
    posting to ./file/info/<j>.html.

    Fixes over the original version:
      * file numbering now starts at 0 so the filenames line up with the
        0-based ./info/<i>.html links emitted by Create_HTML (starting at 1
        left every listing link pointing one posting off);
      * <title> is written inside <head> (it used to come after </head>)
        and the document is closed with </body></html>;
      * the second section heading now says 招聘岗位 -- it printed 招聘单位
        although the value written under it is the work position;
      * files are opened with "w" so re-runs do not append duplicates.
    '''
    j = 0
    for url in urllist:
        page = OpenPage(url)
        # may be None when the server reports failure; the try/excepts
        # below then fall back to the placeholder strings
        info = ParseDetailePage(page)
        try:
            CompanyTitle = info[1].encode("utf8")
        except Exception:
            CompanyTitle = "未知"
        try:
            WorkPositon = info[2].encode("utf8")
        except Exception:
            WorkPositon = "不限"
        try:
            Content = info[3].encode("utf8")
        except Exception:
            Content = "不限"
        f = open("./file/info/" + str(j) + ".html", "w")
        f.write('''<html><head><meta http-equiv="content-type" content="text/html;charset=utf-8">\n''')
        f.write("<title>" + CompanyTitle + "</title>\n")
        f.write("</head>\n<body>\n")
        f.write("<h2>" + CompanyTitle + "</h2>\n<br>")
        f.write('''<font size = "3"><strong>招聘岗位：</strong></font><br> ''')
        f.write("<td><tr> " + WorkPositon + "</tr></td><br>")
        f.write('''<font size = "3"><strong>具体详情：</strong></font><br> ''')
        f.write("<td><tr> " + Content + "</tr></td>")
        f.write("\n</body>\n</html>")
        f.close()
        j = j + 1

def ParseDetailePage(page):
    '''
    Parse one GetOneZhaopin JSON response.

    *page* is a JSON text whose top level is a dict with a "Succeed" flag
    and a "Data" dict holding the posting; "EmployContent" inside it is an
    HTML fragment whose <p> paragraphs carry the job description.

    Returns (Id, CompanyTitle, WorkPositon, content) where content is the
    paragraph texts joined with "<br>\\n", or None (after printing "error")
    when the server reports failure.
    '''
    result = json.loads(page)
    if result["Succeed"] == False:
        print "error"
        return
    posting = result["Data"]
    # the description is an embedded HTML fragment; pull out every <p>
    soup = BeautifulSoup(posting["EmployContent"], "html.parser")
    paragraphs = []
    for tag in soup.find_all("p"):
        paragraphs.append(tag.get_text())
    # join the paragraph texts with an HTML line break per paragraph
    description = "<br>\n".join(paragraphs)
    return posting["Id"], posting["CompanyTitle"], posting["WorkPositon"], description

def Test4():
    '''
    Manual check: fetch one posting via the GetOneZhaopin ajax endpoint and
    print the parsed fields.
    '''
    # page = OpenPage("http://jy.51uns.com:8022/Frame/Data/jdp.ashx?rnd=1533005501840&fn=GetOneZhaopin&JobId=b360f9f177e34d94ba1363615aabda5f&StartDate=2000-01-01")
    # A detail page's F12/Network view also shows async-loaded data; the
    # url below was lifted from one of those requests.
    page = OpenPage("http://jy.51uns.com:8022/Frame/Data/jdp.ashx?rnd=1534220883227&fn=GetOneZhaopin&JobId=b7986ae4f3f94e0bb79677c68b871e30&StartDate=2000-01-01")
    jobid, title, position, content = ParseDetailePage(page)
    print jobid
    print title
    print position
    print content

# Write one posting's data to a file (./info.html)
def WriteDataTofile(data):
    '''
    Append the fields of one parsed posting to ./info.html.

    *data* is the (Id, CompanyTitle, WorkPositon, content) tuple produced
    by ParseDetailePage; each element is written utf-8 encoded, in order.

    Fixes: removed two dead assignments (`info = []` and the
    `"<h2>" + data[0] + "</h2>"` string) whose values were immediately
    clobbered by the loop variable, and the file is now closed even if a
    write fails.
    '''
    f = open("./info.html", "a+")
    try:
        f.write('''<html><head><meta http-equiv="content-type" content="text/html;charset=utf-8"></head>''')
        for field in data:
            f.write(field.encode("utf-8"))
    finally:
        f.close()

def Test5():
    '''
    Manual check: crawl one known posting and append its fields to
    ./info.html via WriteDataTofile.
    '''
    raw = OpenPage("http://jy.51uns.com:8022/Frame/Data/jdp.ashx?rnd=1534220883227&fn=GetOneZhaopin&JobId=b7986ae4f3f94e0bb79677c68b871e30&StartDate=2000-01-01")
    parsed = ParseDetailePage(raw)
    WriteDataTofile(parsed)


def WriteDataToMySql(data):
    db = MySQLdb.connect(host="localhost", user="root", passwd="1", db="Job", charset="utf8")
    cursor = db.cursor()
    # base64 编码可以保证将中文和特殊字符的内容存进数据库

    # content_0 = base64.b64encode(data[0])
    # content_1 = base64.b64encode(data[1])
    # content_2 = base64.b64encode(data[2])
    # content_3 = base64.b64encode(data[3])

    content_0 = data[0].encode("utf-8")
    content_1 = data[1].encode("utf-8")
    content_2 = data[2].encode("utf-8")
    content_3 = data[3].encode("utf-8")

    # 构建sql语句
    sql = "insert into ClawerSchool values('%s', '%s', '%s', '%s');" % (content_0, content_1, content_2, content_3)
    print "sql = " + sql
    try:
        # 执行sql语句
        cursor.execute(sql)
        db.commit()
    except Exception:
        # 插入失败时，为了保证数据库的原子性，出错进行回滚
        db.rollback()

if __name__ == "__main__":
    #  NOTE: the url below already uses limit=975 (65 * 15) so every record is loaded in one request
    page = OpenPage("http://jy.51uns.com:8022/Frame/Data/jdp.ashx?rnd=1534164775674&fn=GetZhaopinList&StartDate=2000-01-01&SearchKey=&InfoType=-1&CompanyAttr=&CompanyType=&Area=&City=&CompanyProvice=&Post=&Zhuanye=&XLkey=&Age=&start=0&limit=975&DateType=999&InfoState=1&WorkType=0&CompanyKey=")
    Create_HTML(page)
    # the listing pages only need to be built once
    urllist = ParseMainPage(page)
    # build one detail page per company
    Create_INFO(urllist)
    # print rows
#     url = "http://jy.51uns.com:8022/Frame/Data/jdp.ashx?rnd=1534224352671&fn=GetZhaopinList&StartDate=2000-01-01&SearchKey=&InfoType=-1&CompanyAttr=&CompanyType=&Area=&City=&CompanyProvice=&Post=&Zhuanye=&XLkey=&Age=&start=0&limit=975&DateType=999&InfoState=1&WorkType=0&CompanyKey="
#     # fetch the server's data via the main page's ajax url
#     mainPage = OpenPage(url)
#     # parse the response to get each posting's detail-data url
#     urlList = ParseMainPage(mainPage)
#     for item in urlList:
#         print "crawler url =" + item
#         detailPage = OpenPage(item)
#         data = ParseDetailePage(detailPage)
#         WriteDataTofile(data)
#     #    WriteDataToMySql(data)
#     print "crawler done"
#
